In [1]:
    
import pandas as pd
import numpy as np
    
In [7]:
    
# Column name -> dtype mapping passed to pd.read_csv so every CSV is parsed
# the same way. id, date and zipcode are read as strings, not numbers.
dtype_dict = {
    **dict.fromkeys(
        ['bathrooms', 'bedrooms', 'floors', 'lat', 'long', 'price',
         'sqft_living', 'sqft_living15', 'sqft_lot15'],
        float,
    ),
    **dict.fromkeys(
        ['condition', 'grade', 'sqft_above', 'sqft_basement', 'sqft_lot',
         'view', 'waterfront', 'yr_built', 'yr_renovated'],
        int,
    ),
    **dict.fromkeys(['date', 'id', 'zipcode'], str),
}
    
In [311]:
    
# Read the four CSV files (in the order listed) with the shared dtype mapping.
# Paths are relative to the notebook's working directory.
train, validation, test, sales = [
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'kc_house_data_small_train.csv',
        'kc_house_data_validation.csv',
        'kc_house_data_small_test.csv',
        'kc_house_data_small.csv',
    )
]
    
In [312]:
    
def get_numpy_data(data_sframe, features, output):
    """Extract feature and target columns from a DataFrame as NumPy arrays.

    Parameters
    ----------
    data_sframe : object supporting column selection and ``.values``
        (a pandas DataFrame in this notebook).
    features : list of column names that make up the feature matrix.
    output : name of the target column.

    Returns
    -------
    tuple ``(feature_matrix, output_array)``. The target is selected with a
    one-element column list, so ``output_array`` is 2-D with a single column.
    """
    target_columns = [output]
    return (data_sframe[features].values, data_sframe[target_columns].values)
    
In [313]:
    
def normalize_features(features):
    """Scale every column of ``features`` to unit Euclidean (L2) norm.

    Parameters
    ----------
    features : array-like whose columns (axis 0) are normalised.

    Returns
    -------
    tuple ``(normalized_features, norms)`` where ``norms`` holds the divisor
    used for each column, so other data can be scaled with the same values.

    A column containing only zeros has norm 0; dividing by it would give
    0/0 = NaN, and one NaN feature turns every distance computed from it
    into NaN. Such columns are divided by 1 instead (they stay all-zero),
    and the returned ``norms`` carries that 1 so dividing other data by
    ``norms`` stays finite.
    """
    norms = np.linalg.norm(features, axis=0)
    # Swap zero norms for 1.0; non-zero norms pass through unchanged.
    safe_norms = np.where(norms == 0, 1.0, norms)
    return (features / safe_norms, safe_norms)
    
In [314]:
    
# Columns fed to the model as inputs.
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]
# Every typed column except the identifiers and the target. Built from a set
# difference, so the order of this list is arbitrary.
my_features = list(dtype_dict.keys() - {'id', 'date', 'zipcode', 'price'})
# Feature matrix and 'price' target for each split: train, validation, test.
(
    (features_train, output_train),
    (features_valid, output_valid),
    (features_test, output_test),
) = (
    get_numpy_data(split, feature_list, 'price')
    for split in (train, validation, test)
)
    
In [315]:
    
# Names present in my_features but missing from feature_list.
# This checks one direction only.
set(my_features).difference(feature_list)
    
    Out[315]:
In [316]:
    
# Normalise the training features, then divide the test and validation
# features by the same per-column norms so all three share one scale.
# NOTE: every name is reassigned from itself, so re-running this cell
# rescales arrays that are already scaled.
features_train, norms = normalize_features(features_train)
features_test, features_valid = features_test / norms, features_valid / norms
    
In [317]:
    
# Show the first test row and the tenth training row after scaling.
print(features_test[0], features_train[9], sep='\n')
    
    
In [318]:
    
import math
    
In [319]:
    
def get_distance(vec1, vec2):
    """Return the Euclidean distance between two vectors as a Python float.

    The inputs must support element-wise subtraction (NumPy arrays in this
    notebook).
    """
    delta = vec1 - vec2
    squared_total = np.sum(np.square(delta))
    return math.sqrt(squared_total)
    
In [320]:
    
# Distance between the first test row and the tenth training row.
get_distance(features_test[0], features_train[9])
    
    Out[320]:
In [321]:
    
# Linear scan over the first ten training rows for the one closest to the
# first test row. On ties the earliest index wins (strict '<').
min_distance, closest_house = None, None
for house_index, house_features in enumerate(features_train[0:10]):
    candidate = get_distance(features_test[0], house_features)
    # min_distance is None only on the first iteration.
    if min_distance is None or candidate < min_distance:
        min_distance, closest_house = candidate, house_index
    
In [322]:
    
# Smallest distance found by the scan, then the index of that training row.
print(min_distance, closest_house, sep='\n')
    
    
In [323]:
    
# Broadcast subtraction: the first test row is subtracted from every
# training row in one vectorised step.
diff = np.subtract(features_train, features_test[0])
    
In [324]:
    
# Sum of the differences in the last row of diff.
diff[-1].sum(axis=0)
    
    Out[324]:
In [325]:
    
# Row-wise Euclidean norm of diff: one distance per training row.
dist = np.sqrt(np.square(diff).sum(axis=1))
    
In [326]:
    
# Distance from the first test row to the training row at index 100.
dist[100]
    
    Out[326]:
In [327]:
    
def compute_distances(features_instances, features_query):
    """Euclidean distance from one query vector to every row of a matrix.

    Parameters
    ----------
    features_instances : 2-D array, one instance per row.
    features_query : vector that broadcasts against each row.

    Returns
    -------
    1-D array with one distance per row of ``features_instances``.
    """
    offsets = features_instances - features_query
    return np.sqrt(np.square(offsets).sum(axis=1))
    
In [328]:
    
# Distances from the third test row (index 2) to every training row,
# followed by the index of the closest training row.
distances = compute_distances(features_train, features_test[2])
print(distances, np.argmin(distances), sep='\n')
    
    
In [296]:
    
# Positions where the minimum distance occurs. ndarray.min() is vectorised,
# whereas the builtin min() walks the array one element at a time in Python.
np.where(distances == distances.min())
    
    Out[296]:
In [297]:
    
# NOTE(review): 1149 is a hard-coded index, presumably the position reported
# by the minimum-distance lookup above — confirm it still matches if the data
# or the query row changes.
distances[1149]
    
    Out[297]:
In [298]:
    
def k_nearest_neighbors(k, feature_train, features_query):
    """Find the k training rows closest to a query vector.

    Parameters
    ----------
    k : number of neighbours to return.
    feature_train : 2-D array of training features, one row per instance.
    features_query : feature vector of the query point.

    Returns
    -------
    tuple ``(distances, neighbour_indices)``: the distance from the query to
    every training row, and the indices of the ``k`` smallest distances in
    ascending order of distance.
    """
    # Use the argument that was passed in. The previous body read the global
    # `features_train`, so the parameter was silently ignored.
    distances = compute_distances(feature_train, features_query)
    return distances, np.argsort(distances)[:k]
    
In [299]:
    
# Four nearest training rows for the third test row (index 2).
distances, neighbours = k_nearest_neighbors(4, features_train, features_test[2])
    
In [300]:
    
# Distance to each selected neighbour, nearest first, then their indices.
for neighbour_index in neighbours:
    print(distances[neighbour_index])
print(neighbours)
    
    
In [301]:
    
# Indices of the selected nearest training rows.
print(neighbours)
    
    
In [302]:
    
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the target for a single query as the mean over its k neighbours.

    Parameters
    ----------
    k : number of neighbours to average over.
    features_train : training feature matrix.
    output_train : training targets, indexable by an array of row indices.
    features_query : feature vector of the query point.

    Returns
    -------
    Mean of ``output_train`` over the k nearest training rows.
    """
    # Only the neighbour indices are needed here; the distances are discarded.
    _, nearest = k_nearest_neighbors(k, features_train, features_query)
    return output_train[nearest].mean()
    
In [303]:
    
# 1-nearest-neighbour prediction for the third test row (index 2).
predict_output_of_query(1, features_train, output_train, features_test[2])
    
    Out[303]:
In [304]:
    
# Same query row, now averaging over the 4 nearest neighbours.
predict_output_of_query(4, features_train, output_train, features_test[2])
    
    Out[304]:
In [305]:
    
# Actual target value for the third test row, to compare with the predictions.
print(output_test[2])
    
    
In [306]:
    
def predict_output(k, features_train, output_train, features_query):
    """Predict the target for every row of a query matrix with k-NN averaging.

    Parameters
    ----------
    k : number of neighbours to average over for each query.
    features_train : training feature matrix.
    output_train : training targets.
    features_query : 2-D array with one query per row.

    Returns
    -------
    Float array of shape ``(number_of_queries, 1)`` with one prediction per
    query row.
    """
    n_queries = features_query.shape[0]
    per_query = [
        predict_output_of_query(k, features_train, output_train, features_query[row])
        for row in range(n_queries)
    ]
    # Column vector, so it lines up with the (n, 1) target arrays.
    return np.array(per_query, dtype=float).reshape(n_queries, 1)
    
In [307]:
    
# 10-NN predictions for the first ten test rows, followed by the position of
# the lowest predicted value among them.
predictions = predict_output(10, features_train, output_train, features_test[:10])
print(predictions, np.argmin(predictions), sep='\n')
    
    
In [308]:
    
# Actual target values for the first ten test rows.
print(output_test[:10])
    
    
In [309]:
    
# Residual sum of squares on the validation set for k = 1..15.
# predictions and output_valid are both column vectors, so error.T @ error is
# a 1x1 array; that array is what gets printed and stored in rsss.
rsss = []
for k in range(1, 16):
    predictions = predict_output(k, features_train, output_train, features_valid)
    error = predictions - output_valid
    rss = np.dot(error.T, error)
    print('RSS for k=%s: %s' % (k, rss))
    rsss.append(rss)
    
    
In [310]:
    
# Residual sum of squares on the test set with k fixed at 3.
predictions = predict_output(3, features_train, output_train, features_test)
error = predictions - output_test
rss = np.dot(error.T, error)
print(rss)
    
    
In [ ]: